import scanpy as sc
import anndata as ad
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
from matplotlib import colors
import seaborn as sb
from gprofiler import GProfiler
import rpy2.rinterface_lib.callbacks
import logging
from rpy2.robjects import pandas2ri
import anndata2ri
from scipy.io import mmread
from scipy.sparse import csr_matrix
# Ignore R warning messages
#Note: this can be commented out to get more verbose R output
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)
# Automatically convert rpy2 outputs to pandas dataframes
pandas2ri.activate()
anndata2ri.activate()
%load_ext rpy2.ipython
plt.rcParams['figure.figsize']=(8,8) #rescale figures
sc.settings.verbosity = 3
#sc.set_figure_params(dpi=200, dpi_save=300)
sc.logging.print_versions()
----- anndata 0.8.0 scanpy 1.9.3 ----- PIL 9.4.0 anndata2ri 1.1 anyio NA appnope 0.1.3 arrow 1.2.3 asttokens NA attr 21.2.0 babel 2.12.1 backcall 0.2.0 certifi 2022.12.07 cffi 1.15.1 charset_normalizer 3.1.0 comm 0.1.2 cycler 0.10.0 cython_runtime NA dateutil 2.8.2 debugpy 1.6.6 decorator 5.1.1 defusedxml 0.7.1 executing 1.2.0 fastjsonschema NA fqdn NA gprofiler 1.0.0 h5py 3.8.0 idna 3.4 igraph 0.10.4 importlib_resources NA ipykernel 6.22.0 ipython_genutils 0.2.0 isoduration NA jedi 0.18.2 jinja2 3.1.2 joblib 1.2.0 json5 NA jsonpointer 2.3 jsonschema 4.17.3 jupyter_events 0.6.3 jupyter_server 2.5.0 jupyterlab_server 2.20.0 kiwisolver 1.4.4 leidenalg 0.9.1 llvmlite 0.39.1 louvain 0.8.0 markupsafe 2.1.2 matplotlib 3.7.1 mpl_toolkits NA natsort 8.3.1 nbformat 5.8.0 numba 0.56.4 numpy 1.23.5 packaging 23.0 pandas 1.5.3 parso 0.8.3 pexpect 4.8.0 pickleshare 0.7.5 pkg_resources NA platformdirs 3.1.1 prometheus_client NA prompt_toolkit 3.0.38 psutil 5.9.4 ptyprocess 0.7.0 pure_eval 0.2.2 pvectorc NA pydev_ipython NA pydevconsole NA pydevd 2.9.5 pydevd_file_utils NA pydevd_plugins NA pydevd_tracing NA pyexpat NA pygments 2.14.0 pyparsing 2.4.7 pyrsistent NA pythonjsonlogger NA pytz 2022.7.1 pytz_deprecation_shim NA requests 2.28.2 rfc3339_validator 0.1.4 rfc3986_validator 0.1.1 rpy2 3.5.12 scipy 1.10.1 seaborn 0.12.2 send2trash NA session_info 1.0.0 six 1.16.0 sklearn 1.2.2 sniffio 1.3.0 stack_data 0.6.2 statsmodels 0.13.5 texttable 1.6.7 threadpoolctl 3.1.0 tornado 6.2 traitlets 5.9.0 tzlocal NA uri_template NA urllib3 1.26.15 wcwidth 0.2.6 webcolors 1.11.1 websocket 1.5.1 yaml 6.0 zipp NA zmq 25.0.2 ----- IPython 8.11.0 jupyter_client 8.1.0 jupyter_core 5.3.0 jupyterlab 3.6.2 notebook 6.5.3 ----- Python 3.9.5 (v3.9.5:0a7dcbdb13, May 3 2021, 13:17:02) [Clang 6.0 (clang-600.0.57)] macOS-10.16-x86_64-i386-64bit ----- Session information updated at 2023-07-13 05:19
%%R
# Load libraries from correct lib Paths for my environment - ignore this!
.libPaths(.libPaths()[c(3,2,1)])
# Load all the R libraries we will be using in the notebook
library(scran)
library(RColorBrewer)
library(slingshot)
library(monocle)
library(gam)
library(clusterExperiment)
library(ggplot2)
library(plyr)
library(MAST)
载入需要的程辑包:SingleCellExperiment
载入需要的程辑包:SummarizedExperiment
载入需要的程辑包:MatrixGenerics
载入需要的程辑包:matrixStats
载入程辑包:‘MatrixGenerics’
The following objects are masked from ‘package:matrixStats’:
colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
colWeightedMeans, colWeightedMedians, colWeightedSds,
colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
rowWeightedSds, rowWeightedVars
载入需要的程辑包:GenomicRanges
载入需要的程辑包:stats4
载入需要的程辑包:BiocGenerics
载入程辑包:‘BiocGenerics’
The following objects are masked from ‘package:stats’:
IQR, mad, sd, var, xtabs
The following objects are masked from ‘package:base’:
Filter, Find, Map, Position, Reduce, anyDuplicated, aperm, append,
as.data.frame, basename, cbind, colnames, dirname, do.call,
duplicated, eval, evalq, get, grep, grepl, intersect, is.unsorted,
lapply, mapply, match, mget, order, paste, pmax, pmax.int, pmin,
pmin.int, rank, rbind, rownames, sapply, setdiff, sort, table,
tapply, union, unique, unsplit, which.max, which.min
载入需要的程辑包:S4Vectors
载入程辑包:‘S4Vectors’
The following object is masked from ‘package:utils’:
findMatches
The following objects are masked from ‘package:base’:
I, expand.grid, unname
载入需要的程辑包:IRanges
载入需要的程辑包:GenomeInfoDb
载入需要的程辑包:Biobase
Welcome to Bioconductor
Vignettes contain introductory material; view with
'browseVignettes()'. To cite Bioconductor, see
'citation("Biobase")', and for packages 'citation("pkgname")'.
载入程辑包:‘Biobase’
The following object is masked from ‘package:MatrixGenerics’:
rowMedians
The following objects are masked from ‘package:matrixStats’:
anyMissing, rowMedians
载入需要的程辑包:scuttle
载入需要的程辑包:princurve
载入需要的程辑包:TrajectoryUtils
载入程辑包:‘TrajectoryUtils’
The following object is masked from ‘package:scran’:
createClusterMST
载入需要的程辑包:Matrix
载入程辑包:‘Matrix’
The following object is masked from ‘package:S4Vectors’:
expand
载入需要的程辑包:ggplot2
载入需要的程辑包:VGAM
载入需要的程辑包:splines
载入需要的程辑包:DDRTree
载入需要的程辑包:irlba
载入程辑包:‘monocle’
The following object is masked from ‘package:scran’:
clusterCells
载入需要的程辑包:foreach
Loaded gam 1.22-2
载入程辑包:‘gam’
The following object is masked from ‘package:VGAM’:
s
载入程辑包:‘plyr’
The following object is masked from ‘package:IRanges’:
desc
The following object is masked from ‘package:S4Vectors’:
rename
The following object is masked from ‘package:matrixStats’:
count
# 1. Reading the data
# Data loading
# Files
sample_strings = ["Duo_M1","Duo_M2","Jej_M1","Jej_M2","Il_M1","Il_M2"]
sample_id_strings = ["3","4","5","6","7","8"]
file_base = "./GSE92332_RAW/GSM283657"
exp_string = "_Regional_"
data_end = "_matrix.mtx.gz"
barcode_end = "_barcodes.tsv.gz"
gene_end = "_genes.tsv.gz"
# load data set
# example of loading first data set
# Parse filenames
sample = sample_strings[0]
sample_id = sample_id_strings[0]
data_file = file_base + sample_id + exp_string + sample + data_end
barcode_file = file_base + sample_id + exp_string + sample + barcode_end
gene_file = file_base + sample_id + exp_string + sample + gene_end
# loading
adata = sc.read(data_file, cache=True)
adata = adata.transpose()
adata.X = adata.X.toarray()
barcodes = pd.read_csv(barcode_file, header = None, sep = '\t')
genes = pd.read_csv(gene_file, header = None, sep = "\t")
# barcodes.index.name = ''
# genes.index.name = ''
# genes.columns = ['gene_ids']
# # Annotate data
barcodes.rename(columns = {0:'barcode'}, inplace = True)
barcodes.set_index("barcode", inplace=True)
adata.obs = barcodes
adata.obs['sample'] = [sample]*adata.n_obs
adata.obs['region'] = [sample.split("_")[0]]*adata.n_obs
adata.obs['donor'] = [sample.split("_")[1]]*adata.n_obs
genes.rename(columns={0:"gene_id", 1:"gene_symbol"}, inplace=True)
genes.set_index('gene_symbol', inplace=True)
adata.var = genes
adata.var_names_make_unique()
... reading from cache file cache/GSE92332_RAW-GSM2836573_Regional_Duo_M1_matrix.h5ad
adata.var
| gene_id | |
|---|---|
| gene_symbol | |
| mm10_Xkr4 | mm10_ENSMUSG00000051951 |
| mm10_Gm1992 | mm10_ENSMUSG00000089699 |
| mm10_Gm37381 | mm10_ENSMUSG00000102343 |
| mm10_Rp1 | mm10_ENSMUSG00000025900 |
| mm10_Rp1-1 | mm10_ENSMUSG00000109048 |
| ... | ... |
| mm10_AC168977.1 | mm10_ENSMUSG00000079808 |
| mm10_PISD | mm10_ENSMUSG00000095041 |
| mm10_DHRSX | mm10_ENSMUSG00000063897 |
| mm10_Vmn2r122 | mm10_ENSMUSG00000096730 |
| mm10_CAAA01147332.1 | mm10_ENSMUSG00000095742 |
27998 rows × 1 columns
# Loop to load all data set
for i in range(1,len(sample_strings)):
# Parse filenames
sample = sample_strings[i]
sample_id = sample_id_strings[i]
data_file = file_base + sample_id + exp_string + sample + data_end
barcode_file = file_base + sample_id + exp_string + sample + barcode_end
gene_file = file_base + sample_id + exp_string + sample + gene_end
# loading
adata_tmp = sc.read(data_file, cache=True)
adata_tmp = adata_tmp.transpose()
adata_tmp.X = adata_tmp.X.toarray()
barcodes_tmp = pd.read_csv(barcode_file, header = None, sep = '\t')
genes_tmp = pd.read_csv(gene_file, header = None, sep = "\t")
# Annotate data
barcodes_tmp.rename(columns = {0:'barcode'}, inplace = True)
barcodes_tmp.set_index("barcode", inplace=True)
adata_tmp.obs = barcodes_tmp
adata_tmp.obs['sample'] = [sample]*adata_tmp.n_obs
adata_tmp.obs['region'] = [sample.split("_")[0]]*adata_tmp.n_obs
adata_tmp.obs['donor'] = [sample.split("_")[1]]*adata_tmp.n_obs
genes_tmp.rename(columns={0:"gene_id", 1:"gene_symbol"}, inplace=True)
genes_tmp.set_index('gene_symbol', inplace=True)
adata_tmp.var = genes_tmp
adata_tmp.var_names_make_unique()
#Concatenate to main adata obj
adata = adata.concatenate(adata_tmp, batch_key=sample_id)
if 'gene_id-1' in adata.var.columns:
adata.var['gene_id'] = adata.var['gene_id-1']
adata.var.drop(columns = ['gene_id-1', 'gene_id-0'], inplace = True)
adata.obs.drop(columns = [sample_id],inplace = True)
adata.obs_names = [c.split('-')[0] for c in adata.obs_names]
adata.obs_names_make_unique(join = '_')
... reading from cache file cache/GSE92332_RAW-GSM2836574_Regional_Duo_M2_matrix.h5ad ... reading from cache file cache/GSE92332_RAW-GSM2836575_Regional_Jej_M1_matrix.h5ad
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/anndata/_core/anndata.py:1785: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour. [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas], /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/anndata/_core/anndata.py:1785: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour. [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
... reading from cache file cache/GSE92332_RAW-GSM2836576_Regional_Jej_M2_matrix.h5ad
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/anndata/_core/anndata.py:1785: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour. [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas], /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/anndata/_core/anndata.py:1785: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour. [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
... reading from cache file cache/GSE92332_RAW-GSM2836577_Regional_Il_M1_matrix.h5ad
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/anndata/_core/anndata.py:1785: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour. [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas], /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/anndata/_core/anndata.py:1785: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour. [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas], /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/anndata/_core/anndata.py:1785: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour. [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas], /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/anndata/_core/anndata.py:1785: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour. [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
... reading from cache file cache/GSE92332_RAW-GSM2836578_Regional_Il_M2_matrix.h5ad
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/anndata/_core/anndata.py:1785: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour. [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas], /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/anndata/_core/anndata.py:1785: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour. [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
# Remove the prefix
adata.var_names = [g.split("_")[1] for g in adata.var_names]
adata.var["gene_id"] = [g.split("_")[1] for g in adata.var["gene_id"]]
print(adata.obs['region'].value_counts())
print('')
print(adata.obs['donor'].value_counts())
print('')
print(adata.obs['sample'].value_counts())
Duo 4554 Jej 4496 Il 4303 Name: region, dtype: int64 M2 6711 M1 6642 Name: donor, dtype: int64 Jej_M2 2668 Il_M1 2466 Duo_M1 2348 Duo_M2 2206 Il_M2 1837 Jej_M1 1828 Name: sample, dtype: int64
# # jeffery
# sc.pp.filter_cells(adata, min_genes= 200)
# sc.pp.filter_genes(adata, min_cells = 3)
# # annotate the group of mitochondrial genes as 'mt'
# adata.var['mt'] = adata.var_names.str.startswith("mt-")
# sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top = None, log1p=False, inplace = True)
# sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
# jitter = 0.4, multi_panel = True)
# QC
adata.obs['n_counts'] = adata.X.sum(1)
adata.obs['log_counts'] = np.log(adata.obs['n_counts'])
adata.obs['n_genes'] = (adata.X > 0).sum(1)
mt_gene_mask = [gene.startswith('mt-') for gene in adata.var_names]
adata.obs['mt_frac'] = adata.X[:,mt_gene_mask].sum(1)/adata.obs['n_counts']
adata.obs.head()
| sample | region | donor | n_counts | log_counts | n_genes | mt_frac | |
|---|---|---|---|---|---|---|---|
| AAACATACAGCGGA | Duo_M1 | Duo | M1 | 12770.0 | 9.454854 | 1227 | 0.001410 |
| AAACATACCTTACT | Duo_M1 | Duo | M1 | 6587.0 | 8.792853 | 2156 | 0.010779 |
| AAACATACTTTGCT | Duo_M1 | Duo | M1 | 2844.0 | 7.952967 | 1424 | 0.033755 |
| AAACCGTGCAGTCA | Duo_M1 | Duo | M1 | 2799.0 | 7.937017 | 1362 | 0.022508 |
| AAACGCTGCAGTCA | Duo_M1 | Duo | M1 | 6050.0 | 8.707813 | 2287 | 0.015041 |
# plot QC metrics
t1 = sc.pl.violin(adata, 'n_counts', groupby= 'sample', size = 2, log = True,
cut = 0)
t2 = sc.pl.violin(adata, 'mt_frac', groupby = 'sample')
p1 = sc.pl.scatter(adata, 'n_counts','n_genes', color = 'mt_frac')
p2 = sc.pl.scatter(adata[adata.obs['n_counts'] < 10000], 'n_counts','n_genes', color = 'mt_frac')
p3 = sb.displot(adata.obs['n_counts'], kde=False)
p4 = sb.displot(adata.obs['n_counts'][adata.obs['n_counts']<4000], kde = False, bins =60)
p5 = sb.displot(adata.obs['n_counts'][adata.obs['n_counts']>10000], kde = False, bins = 60)
p6 = sb.displot(adata.obs['n_genes'], kde = False, bins = 60)
p7 = sb.displot(adata.obs['n_genes'][adata.obs['n_genes']<1000], kde = False, bins = 60)
# filter cells
sc.pp.filter_cells(adata, min_counts = 1500)
sc.pp.filter_cells(adata, max_counts = 40000)
sc.pp.filter_cells(adata, min_genes = 700)
adata = adata[adata.obs['mt_frac']<0.2]
# filter genes
sc.pp.filter_genes(adata, min_cells = 20)
filtered out 479 cells that have less than 1500 counts filtered out 16 cells that have more than 40000 counts filtered out 401 cells that have less than 700 genes expressed filtered out 15180 genes that are detected in less than 20 cells
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/scanpy/preprocessing/_simple.py:251: ImplicitModificationWarning: Trying to modify attribute `.var` of view, initializing view as actual. adata.var['n_cells'] = number
adata.X
array([[0., 0., 0., ..., 0., 1., 0.],
[1., 2., 0., ..., 2., 0., 0.],
[2., 0., 0., ..., 1., 0., 0.],
...,
[5., 2., 0., ..., 0., 2., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[5., 1., 0., ..., 0., 0., 0.]], dtype=float32)
# Keep count data in counts layer
adata.layers["counts"] = adata.X.copy()
#normalization
sc.pp.normalize_total(adata, target_sum= 1e4)
#logrithmize
sc.pp.log1p(adata)
normalizing counts per cell
finished (0:00:00)
adata.raw = adata
# batch correction
sc.pp.combat(adata, key = 'sample')
Standardizing Data across genes. Found 6 batches Found 0 numerical variables: Fitting L/S model and finding priors Finding parametric adjustments
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/scanpy/preprocessing/_combat.py:340: RuntimeWarning: divide by zero encountered in divide (abs(g_new - g_old) / g_old).max(), (abs(d_new - d_old) / d_old).max()
Adjusting data
# find HVG
sc.pp.highly_variable_genes(adata, flavor = 'cell_ranger', n_top_genes = 4000)
If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
finished (0:00:03)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
sc.pl.highly_variable_genes(adata)
# visualization
# dimention reduction
sc.pp.pca(adata, n_comps = 50, use_highly_variable = True, svd_solver = 'arpack')
computing PCA
on highly variable genes
with n_comps=50
finished (0:00:15)
sc.pl.pca_variance_ratio(adata, log = True, n_pcs = 50)
# non-linear dimention reduction
sc.pp.neighbors(adata, n_pcs = 40)
sc.tl.umap(adata)
computing neighbors
using 'X_pca' with n_pcs = 40
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html from .autonotebook import tqdm as notebook_tqdm
finished: added to `.uns['neighbors']`
`.obsp['distances']`, distances for each pair of neighbors
`.obsp['connectivities']`, weighted adjacency matrix (0:00:24)
computing UMAP
finished: added
'X_umap', UMAP coordinates (adata.obsm) (0:00:08)
sc.pl.umap(adata, color = "n_counts")
# clusting
sc.tl.louvain(adata, key_added = 'lv_r1')
sc.tl.louvain(adata, resolution = 0.5, key_added = 'lv_r0.5',)
running Louvain clustering
using the "louvain" package of Traag (2017)
finished: found 15 clusters and added
'lv_r1', the cluster labels (adata.obs, categorical) (0:00:01)
running Louvain clustering
using the "louvain" package of Traag (2017)
finished: found 8 clusters and added
'lv_r0.5', the cluster labels (adata.obs, categorical) (0:00:02)
adata.obs['lv_r0.5'].value_counts()
0 4393 1 2899 2 1775 3 1355 4 830 5 667 6 287 7 251 Name: lv_r0.5, dtype: int64
adata.obs['lv_r1'].value_counts()
0 1655 1 1598 2 1460 3 1406 4 1246 5 886 6 865 7 864 8 802 9 663 10 338 11 312 12 253 13 91 14 18 Name: lv_r1, dtype: int64
sc.tl.leiden(adata, key_added='ld_1')
sc.tl.leiden(adata, key_added = 'ld_0.5', resolution=0.5)
running Leiden clustering
finished: found 20 clusters and added
'ld_1', the cluster labels (adata.obs, categorical) (0:00:02)
running Leiden clustering
finished: found 9 clusters and added
'ld_0.5', the cluster labels (adata.obs, categorical) (0:00:01)
# compare louvain and leiden
sc.pl.umap(adata, color = ['ld_0.5', 'lv_r0.5'])
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/scanpy/plotting/_tools/scatterplots.py:392: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored cax = scatter( /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/scanpy/plotting/_tools/scatterplots.py:392: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored cax = scatter(
# calculate marker genes
sc.tl.rank_genes_groups(adata, groupby='lv_r0.5',key_added='rank_genes_r0.5')
WARNING: Default of the method has been changed to 't-test' from 't-test_overestim_var'
ranking genes
finished: added to `.uns['rank_genes_r0.5']`
'names', sorted np.recarray to be indexed by group ids
'scores', sorted np.recarray to be indexed by group ids
'logfoldchanges', sorted np.recarray to be indexed by group ids
'pvals', sorted np.recarray to be indexed by group ids
'pvals_adj', sorted np.recarray to be indexed by group ids (0:00:14)
# plot marker genes
sc.pl.rank_genes_groups(adata, key = 'rank_genes_r0.5', groups = ['0','1','2'])
sc.pl.rank_genes_groups(adata, key = 'rank_genes_r0.5', groups = ['3','4','5'])
sc.pl.rank_genes_groups(adata, key = 'rank_genes_r0.5', groups = ['6','7'])
#Known marker genes:
marker_genes = dict()
marker_genes['Stem'] = ['Lgr5', 'Ascl2', 'Slc12a2', 'Axin2', 'Olfm4', 'Gkn3']
marker_genes['Enterocyte (Proximal)'] = ['Gsta1','Rbp2','Adh6a','Apoa4','Reg3a','Creb3l3','Cyp3a13','Cyp2d26','Ms4a10','Ace','Aldh1a1','Rdh7','H2-Q2', 'Hsd17b6','Gstm3','Gda','Apoc3','Gpd1','Fabp1','Slc5a1','Mme','Cox7a1','Gsta4','Lct','Khk','Mttp','Xdh','Sult1b1', 'Treh','Lpgat1','Dhrs1','Cyp2c66','Ephx2','Cyp2c65','Cyp3a25','Slc2a2','Ugdh','Gstm6','Retsat','Ppap2a','Acsl5', 'Cyb5r3','Cyb5b','Ckmt1','Aldob','Ckb','Scp2','Prap1']
marker_genes['Enterocyte (Distal)'] = ['Tmigd1','Fabp6','Slc51b','Slc51a','Mep1a','Fam151a','Naaladl1','Slc34a2','Plb1','Nudt4','Dpep1','Pmp22','Xpnpep2','Muc3','Neu1','Clec2h','Phgr1','2200002D01Rik','Prss30','Cubn','Plec','Fgf15','Crip1','Krt20','Dhcr24','Myo15b','Amn','Enpep','Anpep','Slc7a9','Ocm','Anxa2','Aoc1','Ceacam20','Arf6','Abcb1a','Xpnpep1','Vnn1','Cndp2','Nostrin','Slc13a1','Aspa','Maf','Myh14']
marker_genes['Goblet'] = ['Agr2', 'Fcgbp', 'Tff3', 'Clca1', 'Zg16', 'Tpsg1', 'Muc2', 'Galnt12', 'Atoh1', 'Rep15', 'S100a6', 'Pdia5', 'Klk1', 'Pla2g10', 'Spdef', 'Lrrc26', 'Ccl9', 'Bace2', 'Bcas1', 'Slc12a8', 'Smim14', 'Tspan13', 'Txndc5', 'Creb3l4', 'C1galt1c1', 'Creb3l1', 'Qsox1', 'Guca2a', 'Scin', 'Ern2', 'AW112010', 'Fkbp11', 'Capn9', 'Stard3nl', 'Slc50a1', 'Sdf2l1', 'Hgfa', 'Galnt7', 'Hpd', 'Ttc39a', 'Tmed3', 'Pdia6', 'Uap1', 'Gcnt3', 'Tnfaip8', 'Dnajc10', 'Ergic1', 'Tsta3', 'Kdelr3', 'Foxa3', 'Tpd52', 'Tmed9', 'Spink4', 'Nans', 'Cmtm7', 'Creld2', 'Tm9sf3', 'Wars', 'Smim6', 'Manf', 'Oit1', 'Tram1', 'Kdelr2', 'Xbp1', 'Serp1', 'Vimp', 'Guk1', 'Sh3bgrl3', 'Cmpk1', 'Tmsb10', 'Dap', 'Ostc', 'Ssr4', 'Sec61b', 'Pdia3', 'Gale', 'Klf4', 'Krtcap2', 'Arf4', 'Sep15', 'Ssr2', 'Ramp1', 'Calr', 'Ddost']
marker_genes['Paneth'] = ['Gm15284', 'AY761184', 'Defa17', 'Gm14851', 'Defa22', 'Defa-rs1', 'Defa3', 'Defa24', 'Defa26', 'Defa21', 'Lyz1', 'Gm15292', 'Mptx2', 'Ang4']
marker_genes['Enteroendocrine'] = ['Chgb', 'Gfra3', 'Cck', 'Vwa5b2', 'Neurod1', 'Fev', 'Aplp1', 'Scgn', 'Neurog3', 'Resp18', 'Trp53i11', 'Bex2', 'Rph3al', 'Scg5', 'Pcsk1', 'Isl1', 'Maged1', 'Fabp5', 'Celf3', 'Pcsk1n', 'Fam183b', 'Prnp', 'Tac1', 'Gpx3', 'Cplx2', 'Nkx2-2', 'Olfm1', 'Vim', 'Rimbp2', 'Anxa6', 'Scg3', 'Ngfrap1', 'Insm1', 'Gng4', 'Pax6', 'Cnot6l', 'Cacna2d1', 'Tox3', 'Slc39a2', 'Riiad1']
marker_genes['Tuft'] = ['Alox5ap', 'Lrmp', 'Hck', 'Avil', 'Rgs13', 'Ltc4s', 'Trpm5', 'Dclk1', 'Spib', 'Fyb', 'Ptpn6', 'Matk', 'Snrnp25', 'Sh2d7', 'Ly6g6f', 'Kctd12', '1810046K07Rik', 'Hpgds', 'Tuba1a', 'Pik3r5', 'Vav1', 'Tspan6', 'Skap2', 'Pygl', 'Ccdc109b', 'Ccdc28b', 'Plcg2', 'Ly6g6d', 'Alox5', 'Pou2f3', 'Gng13', 'Bmx', 'Ptpn18', 'Nebl', 'Limd2', 'Pea15a', 'Tmem176a', 'Smpx', 'Itpr2', 'Il13ra1', 'Siglecf', 'Ffar3', 'Rac2', 'Hmx2', 'Bpgm', 'Inpp5j', 'Ptgs1', 'Aldh2', 'Pik3cg', 'Cd24a', 'Ethe1', 'Inpp5d', 'Krt23', 'Gprc5c', 'Reep5', 'Csk', 'Bcl2l14', 'Tmem141', 'Coprs', 'Tmem176b', '1110007C09Rik', 'Ildr1', 'Galk1', 'Zfp428', 'Rgs2', 'Inpp5b', 'Gnai2', 'Pla2g4a', 'Acot7', 'Rbm38', 'Gga2', 'Myo1b', 'Adh1', 'Bub3', 'Sec14l1', 'Asah1', 'Ppp3ca', 'Agt', 'Gimap1', 'Krt18', 'Pim3', '2210016L21Rik', 'Tmem9', 'Lima1', 'Fam221a', 'Nt5c3', 'Atp2a3', 'Mlip', 'Vdac3', 'Ccdc23', 'Tmem45b', 'Cd47', 'Lect2', 'Pla2g16', 'Mocs2', 'Arpc5', 'Ndufaf3']
# annotation
cell_annotation = sc.tl.marker_gene_overlap(adata,marker_genes, key = 'rank_genes_r0.5')
cell_annotation
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | |
|---|---|---|---|---|---|---|---|---|
| Stem | 3.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| Enterocyte (Proximal) | 0.0 | 0.0 | 0.0 | 6.0 | 1.0 | 0.0 | 0.0 | 0.0 |
| Enterocyte (Distal) | 0.0 | 0.0 | 2.0 | 17.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| Goblet | 0.0 | 0.0 | 63.0 | 0.0 | 0.0 | 13.0 | 2.0 | 1.0 |
| Paneth | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 14.0 | 0.0 | 0.0 |
| Enteroendocrine | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 30.0 | 0.0 |
| Tuft | 0.0 | 1.0 | 2.0 | 0.0 | 0.0 | 3.0 | 2.0 | 65.0 |
cell_annotation_norm = sc.tl.marker_gene_overlap(adata, marker_genes, key = 'rank_genes_r0.5',
normalize = 'reference')
sb.heatmap(cell_annotation_norm,cbar = False, annot = True)
<Axes: >
# From the heatmap, cluster 4 express marker genes of Enterocyte(Proximal), they could be another subclusters of enterocyte
# Check the mt-frac
sc.pl.violin(adata,'mt_frac', groupby = 'lv_r0.5')
# Here cluster 4 have no signiture
# Here cluster 1,4 have no marker genes, but they are close to cluster 2 (stem cell) and 4(enterocyte) on umap
# check the enterocyte and stem marker genes expression
#Define a nice colour map for gene expression
colors2 = plt.cm.Reds(np.linspace(0, 1, 128))
colors3 = plt.cm.Greys_r(np.linspace(0.7,0.8,20))
colorsComb = np.vstack([colors3, colors2])
mymap = colors.LinearSegmentedColormap.from_list('my_colormap', colorsComb)
# collate all enterocyte markers
ids_entprox = np.in1d(adata.var_names, marker_genes['Enterocyte (Proximal)'])
ids_entdist = np.in1d(adata.var_names, marker_genes['Enterocyte (Distal)'])
ids_ent = np.logical_or(ids_entprox, ids_entdist)
# calculate mean expression and plot
adata.obs['Enterocyte_marker_expr'] = adata.X[:,ids_ent].mean(1)
sc.pl.violin(adata, 'Enterocyte_marker_expr', groupby='lv_r0.5')
sc.pl.umap(adata, color='Enterocyte_marker_expr', color_map = mymap)
# for stem cell
stem_genes = np.in1d(adata.var_names, marker_genes['Stem'])
adata.obs['Stem_marker_expr'] = adata.X[:,stem_genes].mean(1)
sc.pl.violin(adata, 'Stem_marker_expr', groupby='lv_r0.5')
sc.pl.umap(adata, color = 'Stem_marker_expr', color_map = mymap)
sc.pl.rank_genes_groups(adata, key = 'rank_genes_r0.5', groups = '1')
# From (J. M. Li et al., 2021), UBE2C can be marker of transit-amplifying cells. So annotate cluster 1 as TA.
# Cluster 4 shows express pattern like both stem and enterocyte, temporarily annotate cluster 5 as early enterocytes.
# Add annotation
adata.rename_categories('lv_r0.5',['Stem','TA','Goblet','Enterocyte','Enterocyte(early)','Paneth','Enteroendocrine','Tuft'])
# replot the cluster in umap
sc.pl.umap(adata, color = 'lv_r0.5', legend_loc = 'on data')
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/anndata/_core/anndata.py:1160: FutureWarning: The `inplace` parameter in pandas.Categorical.rename_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object. self.obs[key].cat.rename_categories(categories, inplace=True) /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/scanpy/plotting/_tools/scatterplots.py:392: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored cax = scatter(
#Subsetting to relevant clusters
clusters_to_include = [g for g in adata.obs['lv_r0.5'].cat.categories if (g.startswith('Enterocyte') or g.startswith('TA') or g.startswith('Stem'))]
adata_ent = adata[np.isin(adata.obs['lv_r0.5'], clusters_to_include),:].copy()
#Subset to highly variable genes
sc.pp.highly_variable_genes(adata_ent, flavor='cell_ranger', n_top_genes=4000, subset=True)
If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
finished (0:00:03)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
# recalculate PCA for subset
sc.pp.pca(adata_ent, svd_solver='arpack')
sc.pl.pca_variance_ratio(adata_ent)
computing PCA
on highly variable genes
with n_comps=50
finished (0:00:10)
# choose first 7 PC
adata_ent.obsm['X_pca'] = adata_ent.obsm['X_pca'][:,0:7]
# preprocessing for monocle
data_mat_mon = adata.layers['counts'].T
var_mon = adata.var.copy()
obs_mon = adata.obs.copy()
%%R -i data_mat_mon -i obs_mon -i var_mon
#Set up the CellDataSet data structure
pd <- AnnotatedDataFrame(data = obs_mon)
fd <- AnnotatedDataFrame(data = var_mon)
colnames(data_mat_mon) <- rownames(pd)
rownames(data_mat_mon) <- rownames(fd)
ie_regions_cds <- newCellDataSet(cellData=data_mat_mon, phenoData=pd, featureData=fd, expressionFamily=negbinomial.size())
#Normalize the count data
ie_regions_cds <- estimateSizeFactors(ie_regions_cds)
#Calculate dispersions to filter for highly variable genes
ie_regions_cds <- estimateDispersions(ie_regions_cds)
#Filter for Stem, Enterocytes(early), TA, and Enterocytes
cell_types = as.character(pData(ie_regions_cds)$lv_r0.5)
cell_mask = rep(FALSE, length(cell_types))
cells_to_keep = c("Stem", "Enterocyte(early)", "TA", "Enterocyte")
for (item in cells_to_keep) {cell_mask = cell_mask | startsWith(cell_types, item)}
print("Number of cells after filtering:")
print(sum(cell_mask))
print("")
#Filter highly variable genes from our analysis
hvg_mask = fData(ie_regions_cds)$highly_variable
ie_regions_cds <- ie_regions_cds[hvg_mask, cell_mask]
#Do dimensionality reduction
ie_regions_cds <- reduceDimension(ie_regions_cds, norm_method = 'vstExprs', reduction_method='DDRTree', verbose = F, max_components = 7)
#Run for the first time to get the ordering
ie_regions_cds <- orderCells(ie_regions_cds)
#Find the correct root state the corresponds to the 'Stem' cluster
tab1 <- table(pData(ie_regions_cds)$State, pData(ie_regions_cds)$lv_r0.5)
id = which(colnames(tab1) == 'Stem')
root_name = names(which.max(tab1[,id]))
#Run a second time to get the correct root state that overlaps with Stem cells
ie_regions_cds <- orderCells(ie_regions_cds, root_state=root_name)
[1] "Number of cells after filtering:" [1] 9477 [1] ""
Removing 29 outliers 此外: There were 11 warnings (use warnings() to see them)
%%R -w 1000 -h 800
#Get a nice colour map
custom_colour_map = brewer.pal(length(unique(pData(ie_regions_cds)$lv_r0.5)),'Paired')
#Find the correct root state that coresponds to the 'Stem' cluster
tab1 <- table(pData(ie_regions_cds)$State, pData(ie_regions_cds)$lv_r0.5)
id = which(colnames(tab1) == 'Stem')
root_name = names(which.max(tab1[,id]))
# Visualize with cluster labels
options(repr.plot.width=5, repr.plot.height=4)
plot_complex_cell_trajectory(ie_regions_cds, color_by = 'lv_r0.5', show_branch_points = T,
cell_size = 2, cell_link_size = 1, root_states = c(root_name)) +
scale_size(range = c(0.2, 0.2)) +
theme(legend.position="left", legend.title=element_blank(), legend.text=element_text(size=rel(1.5))) +
guides(colour = guide_legend(override.aes = list(size=6))) +
scale_color_manual(values = custom_colour_map)
%%R -w 600 -h 800
#Visualize pseudotime found
options(repr.plot.width=5, repr.plot.height=4)
plot_cell_trajectory(ie_regions_cds, color_by="Pseudotime")